Treamid - PDBBind¶

Filter Data¶

Separating the summary_new.csv file into one file per 'Recursion Step Done' value — done!

In [1]:
from sklearn.metrics import r2_score
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


# Function to filter the CSV file
def filter_csv(input_file_path, output_file_path, recursion_step='1/5'):
    """
    Write a filtered copy of a summary CSV, keeping rows for one recursion step.

    Rows are kept when 'Recursion Step Done' equals `recursion_step` AND the
    physical metrics fall in sane ranges (affinity/clashes/strain < 20,
    confidence > -10).

    Parameters:
    - input_file_path: Path to the input CSV file.
    - output_file_path: Path where the filtered CSV file will be saved.
    - recursion_step: The recursion step value to filter by (default is '1/5').
    """
    summary = pd.read_csv(input_file_path)

    # Boolean mask: the requested recursion step plus range sanity checks.
    keep = (
        summary['Recursion Step Done'].eq(recursion_step)
        & summary['Binding Affinity (kcal/mol)'].lt(20)
        & summary['Number of clashes'].lt(20)
        & summary['Strain Energy'].lt(20)
        & summary['Confidence Score'].gt(-10)
    )

    # Save the filtered data to a new CSV file
    summary[keep].to_csv(output_file_path, index=False)
    print(f"Filtered data for recursion step {recursion_step} saved to: {output_file_path}")
In [2]:
# Base directory and filename patterns
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
input_filename = 'summary_new.csv'
output_filename_pattern = 'filtered_summary_new_rec_{}.csv'

# Full input file path
input_file_path = base_dir + input_filename

# Split the summary into one filtered file per recursion step (1/5 ... 5/5)
for step in range(1, 6):
    recursion_step = f'{step}/5'
    output_file_path = base_dir + output_filename_pattern.format(step)
    filter_csv(input_file_path, output_file_path, recursion_step=recursion_step)
Filtered data for recursion step 1/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_1.csv
Filtered data for recursion step 2/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_2.csv
Filtered data for recursion step 3/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_3.csv
Filtered data for recursion step 4/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_4.csv
Filtered data for recursion step 5/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_5.csv

Read Each Separated File¶

In [3]:
# Inspect each per-step file: row count plus a preview of the first rows
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    print(f'First few rows of the dataframe for recursion step {step}/5:')
    display(data.head())
    
Total number of data points for recursion step 1/5: 12809
First few rows of the dataframe for recursion step 1/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 4a7b_protein_processed 1/5 NaN -7.743852 7 9.233238 -0.15 rank1_confidence-0.15 1.712196e+09 90.840397 8
1 4mt9_protein_processed 1/5 NaN -6.984931 7 9.727376 -0.42 rank1_confidence-0.42 1.712193e+09 73.990098 8
2 4msg_protein_processed 1/5 NaN -7.803800 14 5.416456 -0.21 rank1_confidence-0.21 1.712192e+09 72.087428 8
3 4lwu_protein_processed 1/5 NaN -1.004857 15 5.794271 -0.96 rank1_confidence-0.96 1.712184e+09 53.928325 8
4 3l5f_protein_processed 1/5 NaN -5.983912 4 9.115500 -0.06 rank1_confidence-0.06 1.712203e+09 98.465181 8
Total number of data points for recursion step 2/5: 6713
First few rows of the dataframe for recursion step 2/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 3osw_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.806013 5 11.128419 -0.77 rank1_confidence-0.77 1.712178e+09 47.703313 8
1 2xu5_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.188495 2 7.886749 0.53 rank1_confidence0.53 1.712178e+09 41.020215 8
2 6skb_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.449235 7 10.282784 -0.49 rank1_confidence-0.49 1.712190e+09 64.057879 8
3 5jah_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.638763 17 12.186867 -0.11 rank1_confidence-0.11 1.712188e+09 58.801625 8
4 3oyq_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -4.610049 1 9.622163 -0.56 rank1_confidence-0.56 1.712185e+09 57.664332 8
Total number of data points for recursion step 3/5: 4589
First few rows of the dataframe for recursion step 3/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 5lz9_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.200333 10 10.013039 -0.05 rank1_confidence-0.05 1.712193e+09 73.473761 8
1 3lea_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.868697 15 7.508655 -0.68 rank1_confidence-0.68 1.712173e+09 31.038599 8
2 5qbv_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.375995 5 8.855248 0.23 rank1_confidence0.23 1.712169e+09 19.483277 8
3 2c9d_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.083409 16 7.737979 -0.35 rank1_confidence-0.35 1.712175e+09 39.159119 8
4 6c28_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -3.430942 6 3.496771 -1.25 rank1_confidence-1.25 1.712179e+09 43.019742 8
Total number of data points for recursion step 4/5: 3528
First few rows of the dataframe for recursion step 4/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 4bfy_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.706119 19 9.650411 -0.49 rank1_confidence-0.49 1.712171e+09 26.649156 8
1 2v86_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -4.772636 8 4.767310 -0.84 rank1_confidence-0.84 1.712194e+09 68.381843 8
2 3ant_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.526974 5 5.131837 -0.69 rank1_confidence-0.69 1.712192e+09 70.197422 8
3 3f1a_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -2.818593 15 11.909609 -0.81 rank1_confidence-0.81 1.712175e+09 35.744654 8
4 6cj5_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.489578 1 9.185916 -0.58 rank1_confidence-0.58 1.712171e+09 27.180906 8
Total number of data points for recursion step 5/5: 2763
First few rows of the dataframe for recursion step 5/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 3f18_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.151046 3 6.017198 -0.32 rank1_confidence-0.32 1.712175e+09 33.212631 8
1 1sts_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.828997 8 12.178389 -0.87 rank1_confidence-0.87 1.712170e+09 25.128030 8
2 3b8q_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.206666 15 9.867155 -0.16 rank1_confidence-0.16 1.712179e+09 46.489564 8
3 4az2_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.062797 5 5.276325 0.35 rank1_confidence0.35 1.712173e+09 31.939743 8
4 5xjm_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.818565 7 7.974301 -0.30 rank1_confidence-0.30 1.712181e+09 60.047276 8

Data Distributions¶

In [4]:
# Function to plot distributions of metrics from a given dataset
def plot_metrics_distributions(data, figure_title):
    """Draw a 2x2 grid of histograms (with KDE) for the four docking metrics,
    annotating each panel with the metric's mean and standard deviation."""
    sns.set(style="whitegrid", palette="pastel")

    # Large figure so four panels stay legible
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle(figure_title, fontsize=20, fontweight='bold')

    common_bins = 120  # one shared bin count keeps the panels visually comparable

    # (metric column, histogram color) per panel
    metric_colors = [
        ('Binding Affinity (kcal/mol)', 'olive'),
        ('Number of clashes', 'gold'),
        ('Strain Energy', 'teal'),
        ('Confidence Score', 'purple'),
    ]

    for ax, (metric, color) in zip(axes.flat, metric_colors):
        sns.histplot(data=data, x=metric, kde=True, ax=ax, color=color, bins=common_bins)
        ax.set_title(metric, fontsize=14)
        ax.grid(True)

        # Dashed vertical line marks the mean; text shows mean/std.
        mean_value = data[metric].mean()
        std_value = data[metric].std()
        ax.axvline(mean_value, color='magenta', linestyle='dashed', linewidth=2)
        ax.text(mean_value + std_value, 0.7,
                f'Mean: {mean_value:.2f}\nStd: {std_value:.2f}',
                color='magenta', ha='left')

    # Leave room for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


# Plot the metric distributions for every recursion step's filtered file
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_metrics_distributions(data, f'Distribution of Various Metrics for Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
No description has been provided for this image
Total number of data points for recursion step 2/5: 6713
No description has been provided for this image
Total number of data points for recursion step 3/5: 4589
No description has been provided for this image
Total number of data points for recursion step 4/5: 3528
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Data Distribution Ranges¶

In [5]:
# Function to plot histograms with readable ranges for a given dataset
def plot_histograms_with_readable_ranges(data, title_suffix):
    """Stack four histograms vertically, each clipped to a hand-picked x-range
    so the bulk of each metric's distribution is readable."""
    sns.set(style="whitegrid", palette="pastel")

    fig, axes = plt.subplots(4, 1, figsize=(10, 15))  # vertical layout keeps x-axis labels legible
    fig.suptitle(f'Distribution of Selected Metrics with Readable Ranges - {title_suffix}', fontsize=20, fontweight='bold')

    # (metric column, target axis, x-limits); 20 bins for every panel
    panels = [
        ('Binding Affinity (kcal/mol)', axes[0], (-20, 20)),
        ('Strain Energy', axes[1], (-20, 20)),
        ('Number of clashes', axes[2], (0, 20)),
        ('Confidence Score', axes[3], (-10, 10)),
    ]

    for metric, ax, xlim in panels:
        sns.histplot(data=data, x=metric, kde=True, ax=ax, bins=20, color='skyblue')
        ax.set_title(metric, fontsize=14)
        ax.set_xlabel(metric, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_xlim(xlim)  # focus the view on the interesting range
        ax.tick_params(axis='x', labelsize=10, rotation=45)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Plot the clipped-range histograms for every recursion step's filtered file
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_histograms_with_readable_ranges(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
No description has been provided for this image
Total number of data points for recursion step 2/5: 6713
No description has been provided for this image
Total number of data points for recursion step 3/5: 4589
No description has been provided for this image
Total number of data points for recursion step 4/5: 3528
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶

  • This is all data without selecting range values
In [6]:
def plot_correlations(data, title_suffix):
    """Scatter each physical score against the confidence score, in three
    side-by-side panels (no range filtering applied)."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics - {title_suffix}', fontsize=20, fontweight='bold')

    # (x column, panel title, x label, marker color) for the three panels
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score',
         'Binding Affinity (kcal/mol)', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score',
         'Strain Energy', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score',
         'Number of Clashes', 'salmon'),
    ]

    for ax, (x_col, panel_title, x_label, color) in zip(axes, panels):
        sns.scatterplot(data=data, x=x_col, y='Confidence Score', ax=ax, color=color)
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Confidence Score', fontsize=12)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Plot the raw metric correlations for every recursion step's filtered file
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
No description has been provided for this image
Total number of data points for recursion step 2/5: 6713
No description has been provided for this image
Total number of data points for recursion step 3/5: 4589
No description has been provided for this image
Total number of data points for recursion step 4/5: 3528
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶

  • This is all data with selecting range values
In [7]:
def filter_and_plot_correlations(data, title_suffix):
    """Clip the data to hand-picked metric ranges, then scatter each physical
    score against the confidence score in three side-by-side panels."""
    # Keep only rows inside the ranges of interest (between() is inclusive).
    in_range = (
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    )
    filtered_data = data[in_range]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics (Adjusted Ranges) - {title_suffix}', fontsize=20, fontweight='bold')

    # (x column, panel title, marker color) for the three panels
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'salmon'),
    ]

    for ax, (x_col, panel_title, color) in zip(axes, panels):
        sns.scatterplot(data=filtered_data, x=x_col, y='Confidence Score', ax=ax, color=color)
        ax.set_title(panel_title, fontsize=14)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Plot the range-clipped correlations for every recursion step's filtered file
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    filter_and_plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
No description has been provided for this image
Total number of data points for recursion step 2/5: 6713
No description has been provided for this image
Total number of data points for recursion step 3/5: 4589
No description has been provided for this image
Total number of data points for recursion step 4/5: 3528
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶

  • This is all data with selecting range values
  • Best lines with R^2 values
In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm

def plot_correlation_with_fit(data, recursion_step):
    """For each physical score vs the confidence score: scatter the
    range-clipped data and overlay two straight-line fits — a Huber robust
    fit and an ordinary least-squares fit — each labelled with its R²."""
    # Clip to the ranges of interest (between() bounds are inclusive).
    filtered_data = data[
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations and Linear Fits (Adjusted Ranges) - Recursion Step {recursion_step}', fontsize=20, fontweight='bold')

    # (x column, y column) per panel, with a matching scatter color
    metrics_pairs = [
        ('Binding Affinity (kcal/mol)', 'Confidence Score'),
        ('Strain Energy', 'Confidence Score'),
        ('Number of clashes', 'Confidence Score'),
    ]
    colors = ['skyblue', 'lightgreen', 'salmon']

    for ax, (x_metric, y_metric), color in zip(axes, metrics_pairs, colors):
        x_vals = filtered_data[x_metric]
        y_vals = filtered_data[y_metric]

        sns.scatterplot(data=filtered_data, x=x_metric, y=y_metric, ax=ax, color=color)

        # Huber robust linear fit (downweights outliers).
        design = sm.add_constant(x_vals)  # adds the intercept column
        robust_results = sm.RLM(y_vals, design, M=sm.robust.norms.HuberT()).fit()
        y_pred_robust = robust_results.predict(design)
        r2_robust = r2_score(y_vals, y_pred_robust)

        # Ordinary least-squares line via degree-1 polyfit.
        poly1d_fn = np.poly1d(np.polyfit(x_vals, y_vals, 1))
        y_pred_linear = poly1d_fn(x_vals)
        r2_linear = r2_score(y_vals, y_pred_linear)

        # Overlay both fitted lines with their R² in the legend.
        sns.lineplot(x=x_vals, y=y_pred_robust, ax=ax, color='red', label=f'Robust R² = {r2_robust:.2f}')
        sns.lineplot(x=x_vals, y=y_pred_linear, ax=ax, color='blue', label=f'Linear R² = {r2_linear:.2f}', linestyle='--')

        ax.set_title(f'{x_metric} vs {y_metric}', fontsize=14)
        ax.set_xlabel(x_metric, fontsize=12)
        ax.set_ylabel(y_metric, fontsize=12)
        ax.legend()

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Plot the fitted correlations for every recursion step's filtered file
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlation_with_fit(data, step)
Total number of data points for recursion step 1/5: 12809
No description has been provided for this image
Total number of data points for recursion step 2/5: 6713
No description has been provided for this image
Total number of data points for recursion step 3/5: 4589
No description has been provided for this image
Total number of data points for recursion step 4/5: 3528
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Comparison of Same Size Recursived Proteins¶

Now we compare the proteins which completed the 5/5 recursion step. First we separate the results by their 'Recursion Step Done' values. Each CSV file must contain the same set of Protein IDs!

In [9]:
def filter_data_for_recursion_step(recursion_step, base_dir, filtered_proteins_file, summary_file_template, output_file_template):
    """
    Keep summary rows for one recursion step, restricted to the proteins that
    completed all 5/5 steps, and write them to a per-step CSV.

    Parameters:
    - recursion_step: Integer step number (1-5); rows are matched on
      'Recursion Step Done' == '<step>/5'.
    - base_dir: Base data directory (unused here; kept for interface
      compatibility — the paths passed in are already fully qualified).
    - filtered_proteins_file: CSV providing the 'Protein ID' column of
      proteins to keep.
    - summary_file_template: Template for the summary CSV path; formatted
      with the step number (a plain path without '{}' is used unchanged).
    - output_file_template: Template for the output CSV path, formatted with
      the step number.
    """
    # Protein IDs that survived the step-5/5 filter.
    filtered_proteins_df = pd.read_csv(filtered_proteins_file)
    filtered_proteins = filtered_proteins_df['Protein ID']

    # Load the summary data for the current recursion step.
    summary_file_path = summary_file_template.format(recursion_step)
    summary_data = pd.read_csv(summary_file_path)

    # BUGFIX: the previous version also computed an unused filter over a
    # global `data` DataFrame leaked from an earlier cell — a hidden-state
    # dependency that raises NameError on a fresh kernel. That dead code has
    # been removed; the result below is unchanged.
    filtered_summary_new = summary_data[
        summary_data['Protein ID'].isin(filtered_proteins)
        & (summary_data['Recursion Step Done'] == f'{recursion_step}/5')
        & (summary_data['Binding Affinity (kcal/mol)'] < 1000)
    ]

    # Save the filtered data to a new CSV file
    output_file_path = output_file_template.format(recursion_step)
    filtered_summary_new.to_csv(output_file_path, index=False)

    print(f"Filtered data for recursion step {recursion_step}/5 saved to: {output_file_path}")

# Base directory and file patterns
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
filtered_proteins_file = base_dir + 'filtered_summary_new_rec_5.csv'
summary_file_template = base_dir + 'summary_new.csv'
output_file_template = base_dir + 'filtered_proteins_from_summary_rec{}.csv'

# Produce one per-step file restricted to proteins that completed 5/5 steps
for step in range(1, 6):
    filter_data_for_recursion_step(step, base_dir, filtered_proteins_file,
                                   summary_file_template, output_file_template)
Filtered data for recursion step 1/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec1.csv
Filtered data for recursion step 2/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec2.csv
Filtered data for recursion step 3/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec3.csv
Filtered data for recursion step 4/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec4.csv
Filtered data for recursion step 5/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec5.csv

Read Each Separated File (One per Recursion Step)¶

In [10]:
# Inspect each per-step file: row count plus a preview of the first rows
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    print(f'First few rows of the dataframe for recursion step {step}/5:')
    display(data.head())
Total number of data points for recursion step 1/5: 2768
First few rows of the dataframe for recursion step 1/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 4mt9_protein_processed 1/5 NaN -6.984931 7 9.727376 -0.42 rank1_confidence-0.42 1.712193e+09 73.990098 8
1 3l5f_protein_processed 1/5 NaN -5.983912 4 9.115500 -0.06 rank1_confidence-0.06 1.712203e+09 98.465181 8
2 5f04_protein_processed 1/5 NaN -8.444917 6 9.143929 -0.55 rank1_confidence-0.55 1.712182e+09 64.459544 8
3 5upe_protein_processed 1/5 NaN -7.604168 4 9.029777 -0.21 rank1_confidence-0.21 1.712190e+09 106.675954 8
4 6gu4_protein_processed 1/5 NaN -8.031396 12 5.412763 0.09 rank1_confidence0.09 1.712186e+09 71.208169 8
Total number of data points for recursion step 2/5: 2766
First few rows of the dataframe for recursion step 2/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 2xu5_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.188495 2 7.886749 0.53 rank1_confidence0.53 1.712178e+09 41.020215 8
1 3zrl_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.608416 8 9.125015 -0.50 rank1_confidence-0.50 1.712181e+09 52.921666 8
2 1k6c_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.170387 11 6.698906 -1.32 rank1_confidence-1.32 1.712170e+09 18.585074 8
3 3kmm_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.495392 9 6.488279 -0.90 rank1_confidence-0.90 1.712179e+09 47.001645 8
4 2i3i_protein_processed 2/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.587364 5 8.241349 -0.13 rank1_confidence-0.13 1.712183e+09 49.401926 8
Total number of data points for recursion step 3/5: 2764
First few rows of the dataframe for recursion step 3/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 5lz9_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.200333 10 10.013039 -0.05 rank1_confidence-0.05 1.712193e+09 73.473761 8
1 5qbv_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.375995 5 8.855248 0.23 rank1_confidence0.23 1.712169e+09 19.483277 8
2 1hvi_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.589729 9 8.037342 -1.11 rank1_confidence-1.11 1.712193e+09 69.589330 8
3 2vj6_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.898968 10 7.427671 0.20 rank1_confidence0.20 1.712177e+09 43.562624 8
4 2nwl_protein_processed 3/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -2.177144 11 5.790366 -1.16 rank1_confidence-1.16 1.712185e+09 61.523299 8
Total number of data points for recursion step 4/5: 2763
First few rows of the dataframe for recursion step 4/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 2v86_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -4.772636 8 4.767310 -0.84 rank1_confidence-0.84 1.712194e+09 68.381843 8
1 3ant_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.526974 5 5.131837 -0.69 rank1_confidence-0.69 1.712192e+09 70.197422 8
2 6cj5_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.489578 1 9.185916 -0.58 rank1_confidence-0.58 1.712171e+09 27.180906 8
3 5wae_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -4.399126 3 7.590899 -0.56 rank1_confidence-0.56 1.712178e+09 46.549621 8
4 4erf_protein_processed 4/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.251450 8 3.243217 -0.28 rank1_confidence-0.28 1.712197e+09 75.663525 8
Total number of data points for recursion step 5/5: 2763
First few rows of the dataframe for recursion step 5/5:
Protein ID Recursion Step Done Ligand Description Binding Affinity (kcal/mol) Number of clashes Strain Energy Confidence Score Rank of sdf _timestamp _runtime _step
0 3f18_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.151046 3 6.017198 -0.32 rank1_confidence-0.32 1.712175e+09 33.212631 8
1 1sts_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.828997 8 12.178389 -0.87 rank1_confidence-0.87 1.712170e+09 25.128030 8
2 3b8q_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -6.206666 15 9.867155 -0.16 rank1_confidence-0.16 1.712179e+09 46.489564 8
3 4az2_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -7.062797 5 5.276325 0.35 rank1_confidence0.35 1.712173e+09 31.939743 8
4 5xjm_protein_processed 5/5 /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... -5.818565 7 7.974301 -0.30 rank1_confidence-0.30 1.712181e+09 60.047276 8

Data Distributions of Each Recursive Step¶

In [11]:
# Function to plot distributions of metrics from a given dataset
# NOTE(review): this re-defines plot_metrics_distributions from an earlier
# cell with identical behavior; consider defining it once near the top.
def plot_metrics_distributions(data, figure_title):
    """Plot a 2x2 panel of histograms + KDE for the four docking metrics,
    marking each panel's mean (dashed line) and std (text annotation)."""
    sns.set(style="whitegrid", palette="pastel")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle(figure_title, fontsize=20, fontweight='bold')

    n_bins = 120  # one bin count for all panels keeps the scales comparable

    metrics = ['Binding Affinity (kcal/mol)', 'Number of clashes', 'Strain Energy', 'Confidence Score']
    colors = ['olive', 'gold', 'teal', 'purple']

    for metric, color, ax in zip(metrics, colors, axes.flat):
        sns.histplot(data=data, x=metric, kde=True, ax=ax, color=color, bins=n_bins)
        ax.set_title(metric, fontsize=14)
        ax.grid(True)

        # Annotate mean and standard deviation on each panel.
        mu = data[metric].mean()
        sigma = data[metric].std()
        ax.axvline(mu, color='magenta', linestyle='dashed', linewidth=2)
        ax.text(mu + sigma, 0.7, f'Mean: {mu:.2f}\nStd: {sigma:.2f}', color='magenta', ha='left')

    # Leave headroom for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


# Plot the metric distributions for every same-protein-set per-step file
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_metrics_distributions(data, f'Distribution of Various Metrics for Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
No description has been provided for this image
Total number of data points for recursion step 2/5: 2766
No description has been provided for this image
Total number of data points for recursion step 3/5: 2764
No description has been provided for this image
Total number of data points for recursion step 4/5: 2763
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Data Distribution Ranges of Each Recursive Step¶

In [12]:
# Function to plot histograms with readable ranges for a given dataset
# NOTE(review): duplicate of the earlier definition (identical behavior);
# consider defining it once and reusing.
def plot_histograms_with_readable_ranges(data, title_suffix):
    """Four stacked histograms, each clipped to a metric-specific x-range."""
    sns.set(style="whitegrid", palette="pastel")

    fig, axes = plt.subplots(4, 1, figsize=(10, 15))  # vertical stack keeps x labels readable
    fig.suptitle(f'Distribution of Selected Metrics with Readable Ranges - {title_suffix}', fontsize=20, fontweight='bold')

    # Per-metric x-limits; every panel uses 20 bins
    xlims = {
        'Binding Affinity (kcal/mol)': (-20, 20),
        'Strain Energy': (-20, 20),
        'Number of clashes': (0, 20),
        'Confidence Score': (-10, 10),
    }

    for ax, (metric, xlim) in zip(axes, xlims.items()):
        sns.histplot(data=data, x=metric, kde=True, ax=ax, bins=20, color='skyblue')
        ax.set_title(metric, fontsize=14)
        ax.set_xlabel(metric, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_xlim(xlim)  # focus on the interesting range
        ax.tick_params(axis='x', labelsize=10, rotation=45)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Range-clipped histograms for every recursion step's filtered CSV.
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_histograms_with_readable_ranges(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
No description has been provided for this image
Total number of data points for recursion step 2/5: 2766
No description has been provided for this image
Total number of data points for recursion step 3/5: 2764
No description has been provided for this image
Total number of data points for recursion step 4/5: 2763
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶

  • Uses all of the data, without restricting the metric value ranges
In [13]:
def plot_correlations(data, title_suffix):
    """Scatter each physical metric against the confidence score, side by side.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the three metric columns and 'Confidence Score'.
    title_suffix : str
        Appended to the figure suptitle.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics - {title_suffix}', fontsize=20, fontweight='bold')

    # (x column, panel title, x label, point colour); y is always the confidence score.
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'Binding Affinity (kcal/mol)', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'Strain Energy', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'Number of Clashes', 'salmon'),
    ]

    for ax, (x_col, panel_title, x_label, point_color) in zip(axes, panels):
        sns.scatterplot(data=data, x=x_col, y='Confidence Score', ax=ax, color=point_color)
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Confidence Score', fontsize=12)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Confidence-score correlation scatter plots for each recursion step.
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
No description has been provided for this image
Total number of data points for recursion step 2/5: 2766
No description has been provided for this image
Total number of data points for recursion step 3/5: 2764
No description has been provided for this image
Total number of data points for recursion step 4/5: 2763
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶

  • Uses the data after restricting the metric value ranges
In [14]:
def filter_and_plot_correlations(data, title_suffix):
    """Restrict each metric to its range of interest, then scatter it
    against the confidence score in three side-by-side panels.

    Parameters
    ----------
    data : pd.DataFrame
        Raw per-complex metrics; rows outside the ranges below are dropped.
    title_suffix : str
        Appended to the figure suptitle.
    """
    # Keep only rows whose scores fall inside the ranges of interest
    # (`between` is inclusive on both ends, matching >= / <=).
    in_range = (
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    )
    filtered_data = data[in_range]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics (Adjusted Ranges) - {title_suffix}', fontsize=20, fontweight='bold')

    # (x column, panel title, point colour); y is always the confidence score.
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'salmon'),
    ]

    for ax, (x_col, panel_title, point_color) in zip(axes, panels):
        sns.scatterplot(data=filtered_data, x=x_col, y='Confidence Score', ax=ax, color=point_color)
        ax.set_title(panel_title, fontsize=14)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()



# Range-filtered correlation scatter plots for each recursion step.
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    filter_and_plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
No description has been provided for this image
Total number of data points for recursion step 2/5: 2766
No description has been provided for this image
Total number of data points for recursion step 3/5: 2764
No description has been provided for this image
Total number of data points for recursion step 4/5: 2763
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶

  • This is all data with selecting range values
  • Best-fit lines annotated with R² values
In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm

def plot_correlation_with_fit(data, recursion_step):
    """Scatter each physical metric against the confidence score and overlay
    two straight-line fits: a Huber robust regression and an ordinary
    least-squares line, each labelled with its R² score.

    Parameters
    ----------
    data : pd.DataFrame
        Raw per-complex metrics; rows outside the ranges below are dropped.
    recursion_step : int
        Recursion step number, used only in the figure title.
    """
    # Restrict every metric to its range of interest before fitting
    # (`between` is inclusive on both ends, matching >= / <=).
    subset = data[
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations and Linear Fits (Adjusted Ranges) - Recursion Step {recursion_step}', fontsize=20, fontweight='bold')

    # (x column, point colour); y is always the confidence score.
    panels = [
        ('Binding Affinity (kcal/mol)', 'skyblue'),
        ('Strain Energy', 'lightgreen'),
        ('Number of clashes', 'salmon'),
    ]
    y_metric = 'Confidence Score'

    for ax, (x_metric, point_color) in zip(axes, panels):
        x_values = subset[x_metric]
        y_values = subset[y_metric]

        sns.scatterplot(data=subset, x=x_metric, y=y_metric, ax=ax, color=point_color)

        # Huber-weighted robust linear fit (down-weights outliers).
        design = sm.add_constant(x_values)  # adds the intercept column
        robust_results = sm.RLM(y_values, design, M=sm.robust.norms.HuberT()).fit()
        y_pred_robust = robust_results.predict(design)
        r2_robust = r2_score(y_values, y_pred_robust)

        # Ordinary least-squares (degree-1 polynomial) fit for comparison.
        ols_line = np.poly1d(np.polyfit(x_values, y_values, 1))
        y_pred_linear = ols_line(x_values)
        r2_linear = r2_score(y_values, y_pred_linear)

        sns.lineplot(x=x_values, y=y_pred_robust, ax=ax, color='red', label=f'Robust R² = {r2_robust:.2f}')
        sns.lineplot(x=x_values, y=y_pred_linear, ax=ax, color='blue', label=f'Linear R² = {r2_linear:.2f}', linestyle='--')

        ax.set_title(f'{x_metric} vs {y_metric}', fontsize=14)
        ax.set_xlabel(x_metric, fontsize=12)
        ax.set_ylabel(y_metric, fontsize=12)
        ax.legend()

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


# Correlation + fit figures for each recursion step.
for step in range(1, 6):
    file_path = output_file_template.format(step)
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlation_with_fit(data, step)
Total number of data points for recursion step 1/5: 2768
No description has been provided for this image
Total number of data points for recursion step 2/5: 2766
No description has been provided for this image
Total number of data points for recursion step 3/5: 2764
No description has been provided for this image
Total number of data points for recursion step 4/5: 2763
No description has been provided for this image
Total number of data points for recursion step 5/5: 2763
No description has been provided for this image

Box Plots (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶

In [16]:
# Function to drop rows whose value in `column_name` lies far outside the
# bulk of the distribution. NOTE: despite the classic "IQR" framing, the
# original fences were built from the 10th/90th percentiles, not the
# quartiles — the defaults below preserve that exact behavior.
def remove_outliers(df, column_name, lower_q=0.1, upper_q=0.9, factor=1.5):
    """Return rows of `df` whose `column_name` value lies within
    [q_low - factor * spread, q_high + factor * spread], where q_low/q_high
    are the `lower_q`/`upper_q` quantiles and spread = q_high - q_low.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; never modified.
    column_name : str
        Column whose values are filtered on.
    lower_q, upper_q : float, optional
        Quantiles bounding the "bulk" of the data. Defaults 0.1/0.9 match
        the original implementation; pass 0.25/0.75 for a textbook IQR rule.
    factor : float, optional
        Multiplier on the inter-quantile spread (default 1.5).

    Returns
    -------
    pd.DataFrame
        The surviving rows (original indices preserved).
    """
    q_low = df[column_name].quantile(lower_q)
    q_high = df[column_name].quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - factor * spread
    upper_bound = q_high + factor * spread
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# Function to plot a fancy boxplot for a given column
def plot_fancy_boxplot(data, column_name, recursion_steps, color):
    """Boxplot of `column_name` per recursion step with raw points overlaid.

    `recursion_steps` is accepted for interface compatibility but unused:
    the x-axis categories come from the 'Recursion Step' column itself.
    """
    trimmed = remove_outliers(data, column_name)
    plt.figure(figsize=(12, 8))
    ax = sns.boxplot(x='Recursion Step', y=column_name, data=trimmed, fliersize=4, linewidth=2.5, color=color)
    # Overlay the individual observations to show the underlying spread.
    sns.stripplot(x='Recursion Step', y=column_name, data=trimmed, jitter=True, size=4, color='gray', alpha=0.5)
    plt.title(f'{column_name} across Recursion Steps (Without Outliers)', fontsize=18)
    plt.xlabel('Recursion Step', fontsize=14)
    plt.ylabel(column_name, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    sns.despine(offset=10, trim=True)
    plt.tight_layout()
    plt.show()

# Load the per-step CSVs and stack them into one frame, tagging every row
# with its 'Recursion Step' label so later plots can group by step.
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
output_file_template = 'filtered_proteins_from_summary_rec{}.csv'

# Collect the frames in a list and concatenate ONCE at the end: calling
# pd.concat inside the loop re-copies the accumulated data every iteration
# (quadratic in total size). ignore_index gives a clean 0..N-1 index instead
# of five overlapping per-file ranges.
step_frames = []
for step in range(1, 6):
    df = pd.read_csv(f'{base_dir}{output_file_template.format(step)}')
    df['Recursion Step'] = f'{step}/5'
    step_frames.append(df)
all_data = pd.concat(step_frames, ignore_index=True)

# Metric -> box colour; dict insertion order also fixes the plotting order.
columns_to_plot = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes',
]

colors = {
    'Binding Affinity (kcal/mol)': 'skyblue',
    'Confidence Score': 'lightgreen',
    'Strain Energy': 'lightcoral',
    'Number of clashes': 'violet',
}

# One fancy boxplot per metric, coloured per the mapping above.
for column in columns_to_plot:
    plot_fancy_boxplot(all_data, column, range(1, 6), colors[column])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Violin Plots (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶

In [17]:
# Function to plot a fancy violin plot for a given column
def plot_fancy_violin_plot(data, column_name, recursion_steps, color):
    """Violin plot of `column_name` per recursion step with raw points overlaid.

    `recursion_steps` is accepted for interface compatibility but unused:
    the x-axis categories come from the 'Recursion Step' column itself.
    """
    trimmed = remove_outliers(data, column_name)
    plt.figure(figsize=(12, 8))
    # inner='quartile' draws the quartile lines inside each violin.
    sns.violinplot(x='Recursion Step', y=column_name, data=trimmed, inner='quartile', linewidth=2.5, color=color)
    sns.stripplot(x='Recursion Step', y=column_name, data=trimmed, jitter=True, size=4, color='grey', alpha=0.6)
    plt.title(f'{column_name} across Recursion Steps (Without Outliers)', fontsize=18)
    plt.xlabel('Recursion Step', fontsize=14)
    plt.ylabel(column_name, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    sns.despine(offset=10, trim=True)
    plt.tight_layout()
    plt.show()

# Assumes 'all_data' has been built (with its 'Recursion Step' column) by the
# earlier loading cell that defined 'base_dir' and 'output_file_template'.

# Metrics to draw violin plots for, and their colours.
columns_to_plot = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes',
]

colors = {
    'Binding Affinity (kcal/mol)': 'skyblue',
    'Confidence Score': 'lightgreen',
    'Strain Energy': 'lightcoral',
    'Number of clashes': 'violet',
}

# Plot fancy violin plots for all specified columns with colors.
for column in columns_to_plot:
    plot_fancy_violin_plot(all_data, column, range(1, 6), colors[column])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Comparative Analysis (t-tests, ANOVA, Kruskal-Wallis test, Tukey HSD test)¶

In [18]:
import scipy.stats as stats

# Function to perform t-tests between recursion steps for a given column
def perform_t_tests(data, column_name):
    unique_steps = data['Recursion Step'].unique()
    for i in range(len(unique_steps)):
        for j in range(i+1, len(unique_steps)):
            step_i = data[data['Recursion Step'] == unique_steps[i]][column_name]
            step_j = data[data['Recursion Step'] == unique_steps[j]][column_name]
            t_stat, p_val = stats.ttest_ind(step_i, step_j, equal_var=False)
            print(f"T-test between {unique_steps[i]} and {unique_steps[j]} for {column_name}:")
            print(f"T-statistic: {t_stat}, P-value: {p_val}")
            print("\n")

# Function to perform ANOVA across all recursion steps for a given column
def perform_anova(data, column_name):
    data_groups = [data[data['Recursion Step'] == step][column_name] for step in data['Recursion Step'].unique()]
    f_stat, p_val = stats.f_oneway(*data_groups)
    print(f"ANOVA for {column_name}:")
    print(f"F-statistic: {f_stat}, P-value: {p_val}")
    print("\n")

# Function to perform non-parametric Kruskal-Wallis test across all recursion steps for a given column
def perform_kruskal_wallis(data, column_name):
    data_groups = [data[data['Recursion Step'] == step][column_name] for step in data['Recursion Step'].unique()]
    h_stat, p_val = stats.kruskal(*data_groups)
    print(f"Kruskal-Wallis test for {column_name}:")
    print(f"H-statistic: {h_stat}, P-value: {p_val}")
    print("\n")

# Assuming 'all_data' DataFrame is ready and contains 'Recursion Step' column


# Metrics to run the statistical comparisons on.
column_to_analyze = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes',
]

# For each metric: pairwise Welch t-tests, then one-way ANOVA, then the
# non-parametric Kruskal-Wallis test across all recursion steps.
for column in column_to_analyze:
    perform_t_tests(all_data, column)
    perform_anova(all_data, column)
    perform_kruskal_wallis(all_data, column)
T-test between 1/5 and 2/5 for Binding Affinity (kcal/mol):
T-statistic: 0.36898556049411196, P-value: 0.7121537928083277


T-test between 1/5 and 3/5 for Binding Affinity (kcal/mol):
T-statistic: 0.2329064062917231, P-value: 0.8158434981121949


T-test between 1/5 and 4/5 for Binding Affinity (kcal/mol):
T-statistic: 0.4531290450527204, P-value: 0.650475128591714


T-test between 1/5 and 5/5 for Binding Affinity (kcal/mol):
T-statistic: -1.5127641352192291, P-value: 0.13039850905571027


T-test between 2/5 and 3/5 for Binding Affinity (kcal/mol):
T-statistic: -0.16430827239685758, P-value: 0.8694944783543672


T-test between 2/5 and 4/5 for Binding Affinity (kcal/mol):
T-statistic: 0.09925940992664334, P-value: 0.9209359051841861


T-test between 2/5 and 5/5 for Binding Affinity (kcal/mol):
T-statistic: -2.175000921544778, P-value: 0.029672907378204653


T-test between 3/5 and 4/5 for Binding Affinity (kcal/mol):
T-statistic: 0.26470682645715254, P-value: 0.7912451937911257


T-test between 3/5 and 5/5 for Binding Affinity (kcal/mol):
T-statistic: -2.0360095269038796, P-value: 0.04179786691038868


T-test between 4/5 and 5/5 for Binding Affinity (kcal/mol):
T-statistic: -2.2721968114625497, P-value: 0.023113476650745327


ANOVA for Binding Affinity (kcal/mol):
F-statistic: 1.6557797274380939, P-value: 0.1572631946602122


Kruskal-Wallis test for Binding Affinity (kcal/mol):
H-statistic: 3.2204392362837098, P-value: 0.5216359983730945


T-test between 1/5 and 2/5 for Confidence Score:
T-statistic: -0.15066092884768245, P-value: 0.8802486885982177


T-test between 1/5 and 3/5 for Confidence Score:
T-statistic: 0.0974582340208248, P-value: 0.9223660326918778


T-test between 1/5 and 4/5 for Confidence Score:
T-statistic: -1.2021169715379008, P-value: 0.22937295535424126


T-test between 1/5 and 5/5 for Confidence Score:
T-statistic: 0.45502112776988995, P-value: 0.6491119113901627


T-test between 2/5 and 3/5 for Confidence Score:
T-statistic: 0.2486837271791837, P-value: 0.8036146754581329


T-test between 2/5 and 4/5 for Confidence Score:
T-statistic: -1.0728082156606968, P-value: 0.2834070226204205


T-test between 2/5 and 5/5 for Confidence Score:
T-statistic: 0.6048050616210732, P-value: 0.5453333696863079


T-test between 3/5 and 4/5 for Confidence Score:
T-statistic: -1.2890862098611735, P-value: 0.19742563656462017


T-test between 3/5 and 5/5 for Confidence Score:
T-statistic: 0.3598016692247383, P-value: 0.7190092149271758


T-test between 4/5 and 5/5 for Confidence Score:
T-statistic: 1.5884891666047898, P-value: 0.11223611580529022


ANOVA for Confidence Score:
F-statistic: 0.86888165238881, P-value: 0.4816376788466181


Kruskal-Wallis test for Confidence Score:
H-statistic: 1.2483887886284888, P-value: 0.8700692621758888


T-test between 1/5 and 2/5 for Strain Energy:
T-statistic: -0.7319547567507239, P-value: 0.46422721577452497


T-test between 1/5 and 3/5 for Strain Energy:
T-statistic: -0.5132622222426472, P-value: 0.6077884132139629


T-test between 1/5 and 4/5 for Strain Energy:
T-statistic: -0.4822940091033011, P-value: 0.6296161947652388


T-test between 1/5 and 5/5 for Strain Energy:
T-statistic: -0.48009629133263887, P-value: 0.6311781615396754


T-test between 2/5 and 3/5 for Strain Energy:
T-statistic: 0.21623722388874564, P-value: 0.8288108200331613


T-test between 2/5 and 4/5 for Strain Energy:
T-statistic: 0.2446543598456842, P-value: 0.8067331571943515


T-test between 2/5 and 5/5 for Strain Energy:
T-statistic: 0.20210351292805548, P-value: 0.8398433899859543


T-test between 3/5 and 4/5 for Strain Energy:
T-statistic: 0.029091411838153763, P-value: 0.976792736053002


T-test between 3/5 and 5/5 for Strain Energy:
T-statistic: 0.0001511430182507797, P-value: 0.9998794108632965


T-test between 4/5 and 5/5 for Strain Energy:
T-statistic: -0.0270598854097343, P-value: 0.9784129609086499


ANOVA for Strain Energy:
F-statistic: 0.13782524944989935, P-value: 0.968315439830812


Kruskal-Wallis test for Strain Energy:
H-statistic: 1.9636033513015718, P-value: 0.7424532988249419


T-test between 1/5 and 2/5 for Number of clashes:
T-statistic: -0.39527331902194923, P-value: 0.6926564286386435


T-test between 1/5 and 3/5 for Number of clashes:
T-statistic: 1.1305238829657684, P-value: 0.25830457195463846


T-test between 1/5 and 4/5 for Number of clashes:
T-statistic: -0.8255911580017684, P-value: 0.40907167376338827


T-test between 1/5 and 5/5 for Number of clashes:
T-statistic: -8.307516039441273, P-value: 1.2387584781813465e-16


T-test between 2/5 and 3/5 for Number of clashes:
T-statistic: 1.5267133888223372, P-value: 0.12688951090686226


T-test between 2/5 and 4/5 for Number of clashes:
T-statistic: -0.42865476661795643, P-value: 0.6681911890216506


T-test between 2/5 and 5/5 for Number of clashes:
T-statistic: -7.969065339604163, P-value: 1.9553513590135814e-15


T-test between 3/5 and 4/5 for Number of clashes:
T-statistic: -1.9628420446240016, P-value: 0.049714630879825625


T-test between 3/5 and 5/5 for Number of clashes:
T-statistic: -9.288209390147173, P-value: 2.2696235084951756e-20


T-test between 4/5 and 5/5 for Number of clashes:
T-statistic: -7.626390730307421, P-value: 2.861759313341065e-14


ANOVA for Number of clashes:
F-statistic: 34.10450911333242, P-value: 2.2825366164770107e-28


Kruskal-Wallis test for Number of clashes:
H-statistic: 44.855516114925344, P-value: 4.260682422770966e-09


In [19]:
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt


from statsmodels.stats.multicomp import pairwise_tukeyhsd
import pandas as pd

# Assuming 'all_data' is your DataFrame and it includes 'Recursion Step' and the metric of interest

def tukey_hsd_posthoc_test(data, column_name):
    """Tukey HSD post-hoc test of `column_name` across recursion steps.

    Prints the pairwise comparison table and draws the simultaneous
    confidence-interval plot.
    """
    # alpha=0.05 is the family-wise significance level.
    tukey_result = pairwise_tukeyhsd(endog=data[column_name],
                                     groups=data['Recursion Step'],
                                     alpha=0.05)
    print(tukey_result)
    tukey_result.plot_simultaneous()
    plt.title(f'Tukey HSD test for {column_name}')
    plt.show()
    
# Load your dataset
# Ensure 'all_data' DataFrame is prepared with 'Recursion Step' and the metrics columns

# Define a function to annotate statistical significance on plots
def annotate_significance(ax, x1, x2, max_y, text):
    """Draw a significance bracket between x1 and x2 with `text` centred above.

    The bracket base sits 2% above `max_y` (the tallest data point in view);
    the bracket is one data-unit tall.
    """
    bracket_y = max_y + max_y * 0.02  # slightly above the highest data point
    top_y = bracket_y + 1
    ax.plot([x1, x1, x2, x2], [bracket_y, top_y, top_y, bracket_y], lw=1.5, c='black')
    ax.text((x1 + x2) * .5, top_y, text, ha='center', va='bottom', color='black')

# Function to visualize and statistically compare groups
def visualize_and_compare(data, column_name):
    """Visualize `column_name` across recursion steps (boxplot + violin plot),
    then test for group differences with one-way ANOVA and, when significant,
    a Tukey HSD post-hoc test.

    Plots use the outlier-trimmed data; the ANOVA runs on the full `data`.
    """
    cleaned_data = remove_outliers(data, column_name)

    # Boxplot overview. Assigning `hue` with `legend=False` replaces the
    # bare-`palette` usage that seaborn deprecated (FutureWarning, removal
    # planned for v0.14.0) while producing the same colouring.
    plt.figure(figsize=(12, 8))
    sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data,
                hue='Recursion Step', palette="Set3", legend=False)
    sns.stripplot(x='Recursion Step', y=column_name, data=cleaned_data, color='grey', alpha=0.6, jitter=True)
    plt.title(f'{column_name} across Recursion Steps')
    plt.show()

    # Violin plot to show the distribution shapes.
    plt.figure(figsize=(12, 8))
    sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile',
                   hue='Recursion Step', palette="Set2", legend=False)
    plt.title(f'Distribution of {column_name} across Recursion Steps')
    plt.show()

    # Perform ANOVA (each group needs more than one observation).
    groups = [group[column_name].dropna() for name, group in data.groupby('Recursion Step')]
    if all(group.size > 1 for group in groups):
        f_val, p_val = stats.f_oneway(*groups)
        print(f"ANOVA for {column_name}: F-value = {f_val}, P-value = {p_val}")

        if p_val < 0.05:
            print("Significant differences found. Consider conducting post-hoc tests for pairwise comparisons.")
            print("Conducting post-hoc test...")
            tukey_hsd_posthoc_test(cleaned_data, column_name)
        else:
            print("No significant differences found across Recursion Steps.")
    else:
        print("Not all groups have sufficient data for ANOVA.")



# Metrics to visualize and statistically compare.
column_to_analyze = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes',
]

for column in column_to_analyze:
    visualize_and_compare(all_data, column)

# Note: Tukey HSD runs automatically inside visualize_and_compare when the
# ANOVA is significant; pairwise t-tests with Bonferroni correction are an
# alternative post-hoc strategy.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
No description has been provided for this image
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
No description has been provided for this image
ANOVA for Binding Affinity (kcal/mol): F-value = 1.6557797274380939, P-value = 0.1572631946602122
No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
No description has been provided for this image
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
No description has been provided for this image
ANOVA for Confidence Score: F-value = 0.86888165238881, P-value = 0.4816376788466181
No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
No description has been provided for this image
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
No description has been provided for this image
ANOVA for Strain Energy: F-value = 0.13782524944989935, P-value = 0.968315439830812
No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
No description has been provided for this image
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
No description has been provided for this image
ANOVA for Number of clashes: F-value = 34.10450911333242, P-value = 2.2825366164770107e-28
Significant differences found. Consider conducting post-hoc tests for pairwise comparisons.
Conducting post-hoc test...
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   1/5    2/5    0.046 0.9859 -0.1976 0.2896  False
   1/5    3/5  -0.0812 0.8935 -0.3248 0.1624  False
   1/5    4/5   0.0817 0.8912 -0.1619 0.3254  False
   1/5    5/5    0.832    0.0  0.5884 1.0756   True
   2/5    3/5  -0.1272 0.6121 -0.3708 0.1165  False
   2/5    4/5   0.0357 0.9946 -0.2079 0.2794  False
   2/5    5/5    0.786    0.0  0.5424 1.0297   True
   3/5    4/5   0.1629 0.3598 -0.0808 0.4066  False
   3/5    5/5   0.9132    0.0  0.6695 1.1569   True
   4/5    5/5   0.7503    0.0  0.5065  0.994   True
---------------------------------------------------
No description has been provided for this image

PCA¶

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Assuming 'all_data' is your DataFrame

# Outlier removal function (re-defined in this cell so it runs standalone).
# NOTE: despite the classic "IQR" framing, the original fences were built
# from the 10th/90th percentiles, not the quartiles — the defaults below
# preserve that exact behavior.
def remove_outliers(df, column_name, lower_q=0.1, upper_q=0.9, factor=1.5):
    """Return rows of `df` whose `column_name` value lies within
    [q_low - factor * spread, q_high + factor * spread], where q_low/q_high
    are the `lower_q`/`upper_q` quantiles and spread = q_high - q_low.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; never modified.
    column_name : str
        Column whose values are filtered on.
    lower_q, upper_q : float, optional
        Quantiles bounding the "bulk" of the data. Defaults 0.1/0.9 match
        the original implementation; pass 0.25/0.75 for a textbook IQR rule.
    factor : float, optional
        Multiplier on the inter-quantile spread (default 1.5).

    Returns
    -------
    pd.DataFrame
        The surviving rows (original indices preserved).
    """
    q_low = df[column_name].quantile(lower_q)
    q_high = df[column_name].quantile(upper_q)
    spread = q_high - q_low
    lower_bound = q_low - factor * spread
    upper_bound = q_high + factor * spread
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# List of features to include in PCA
features = ['Binding Affinity (kcal/mol)', 'Confidence Score', 'Strain Energy', 'Number of clashes']

# Apply outlier removal feature-by-feature, threading the shrinking frame
# through each pass. Starting from `all_data` (instead of an empty frame
# plus an `.empty` check) also avoids silently restarting from the full
# data if a pass ever removed every remaining row.
cleaned_data = all_data
for feature in features:
    cleaned_data = remove_outliers(cleaned_data, feature)

# Standardizing the features — PCA is scale-sensitive, so each feature is
# centred and scaled to unit variance first.
x = StandardScaler().fit_transform(cleaned_data.loc[:, features].values)

# Project onto the first two principal components.
pca = PCA(n_components=2)  # adjust n_components as needed
principalComponents = pca.fit_transform(x)

# Create a DataFrame with the principal components
principalDf = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])

# Visualizing the PCA results
plt.figure(figsize=(8, 6))
plt.scatter(principalDf['Principal Component 1'], principalDf['Principal Component 2'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2 component PCA')
plt.grid(True)
plt.show()

# Share of total variance captured by each retained component.
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
No description has been provided for this image
Explained variance ratio: [0.37282531 0.25644431]

One-Sample t-Test¶

In [21]:
from scipy import stats

# One-sample t-test example
def one_sample_t_test(data, column_name, population_mean):
    """
    Test if the mean of the sample in `column_name` is equal to `population_mean`.
    """
    t_stat, p_val = stats.ttest_1samp(data[column_name].dropna(), population_mean)
    print(f"One-sample t-test for {column_name}:")
    print(f"T-statistic: {t_stat}, P-value: {p_val}")

# Test every metric's mean against a hypothesized population mean of 0.
features = ['Binding Affinity (kcal/mol)', 'Confidence Score', 'Strain Energy', 'Number of clashes']

for feat in features:
    one_sample_t_test(all_data, feat, 0)
One-sample t-test for Binding Affinity (kcal/mol):
T-statistic: -495.84507190316606, P-value: 0.0
One-sample t-test for Confidence Score:
T-statistic: -114.43554938475592, P-value: 0.0
One-sample t-test for Strain Energy:
T-statistic: 432.11095456705647, P-value: 0.0
One-sample t-test for Number of clashes:
T-statistic: 223.89591635586228, P-value: 0.0

ANOVA Test¶

In [22]:
def anova_test(data, column_name):
    """
    Test if at least one group mean of `column_name` is different from the others.
    """
    group_data = [group.dropna() for name, group in data.groupby('Recursion Step')[column_name]]
    f_stat, p_val = stats.f_oneway(*group_data)
    print(f"ANOVA for {column_name}:")
    print(f"F-statistic: {f_stat}, P-value: {p_val}")


# Run the one-way ANOVA across recursion steps for every metric.
for feat in features:
    anova_test(all_data, feat)
ANOVA for Binding Affinity (kcal/mol):
F-statistic: 1.6557797274380939, P-value: 0.1572631946602122
ANOVA for Confidence Score:
F-statistic: 0.86888165238881, P-value: 0.4816376788466181
ANOVA for Strain Energy:
F-statistic: 0.13782524944989935, P-value: 0.968315439830812
ANOVA for Number of clashes:
F-statistic: 34.10450911333242, P-value: 2.2825366164770107e-28